Estimate GFLOPS (iterative computation, fractals)

The dot product approach did not succeed in squeezing the most out of the ALUs in the GPU. Here we will try to do iterative computation in a loop instead. The difficulty is that we have to make sure the compiler cannot easily optimize the work away (so the results have to be unpredictable).

The simplest algorithm that satisfies this is probably the Julia fractal which consists of hundreds of complex number multiplications and each pixel gets wildly different (chaotic) results. This gives us a lot of arithmetic with barely any load/store operations (and pretty pictures).

I am only counting the instructions in the inner loop here (since we run it 4000 times, the rest does not really matter). It’s surprising how many non-FMA instructions are required to get the job done, but we get 74.75 GOPS. Does it add up?

The marketing for this SoC seems to talk about a G52 MP4, but the hardware registers report it as having only 2 cores. The most likely explanation is that it has 2 cores, each with 3 execution units with an 8-way SIMD ALU (note that a G52 could also have a 4-way ALU). Since the ALU executes two operations per cycle (but only one of them may be a multiplication), this gives us 96 operations per clock × 800 MHz = 76.8 GOPS (for a maximum of 38.4 multiplication GFLOPS).

The calculation assumes there are no overheads or latencies of any kind, so our 74.75 GOPS looks pretty good.

Test 6: Julia fractal

Warning

The FMA % numbers printed below are for the whole shader code. Since we have loops, they do not apply and we have to inspect the disassembly, find the loop and manually count the ratio.

wh = 256
def test(wh=wh, localx=1, localy=1, membw=False):
    global intex, outtex, source
    w,h = wh,wh
    if localx * localy > 256: return float('nan')
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)
    source = f"""
    #version 310 es
    precision highp float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;
    layout(location = 2) uniform int iterations;

    void main() {{
      vec2 p = (vec2(gl_GlobalInvocationID.xy) / vec2({w}.,{h}.) - vec2(0.5)) * vec2(3.);
      int r = 0;

      for(int i = 0; i < iterations; i++) {{
        if(dot(p, p) < 10.) {{
          r++;
        }}
        p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
      }} 
      
      float n = float(r) / float(iterations) * 4.;
      
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy),
              vec4(0.5-cos(n*75.0)/2.0,0.5-cos(n* 120.0)/2.0,0.5-cos(n*165.0)/2.0,1.0));
    }}
    """
    if membw:
        print("\n".join([f"{n+1: 5d}  {line}" for n, line in enumerate(source.split('\n'))]))
    computeShader(source)
    ITERS = 1000
    gl.glUniform1i(2, ITERS)
    start = time.perf_counter()
    for i in range(50):
        gl.glDispatchCompute(w//localx, h//localy, 1)
        # make sure writing to image has finished before read
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    gl.glFinish()
    elapsed = (time.perf_counter() - start)/50
    MACs = 13*ITERS*w*h

    if membw:
        print(f"{MACs / elapsed / 1e9:.2f} GFLOPS   {w*h*4*4 / elapsed / 1024 / 1024:.2f} MB/s  {elapsed * 1e3:.2f} ms")
    
    return MACs / elapsed / 1e9

# First call uses the defaults (1x1 work-groups) and discards the result.
# NOTE(review): presumably a warm-up run - confirm.
test()
# Measure with 16x16 work-groups; membw=True also prints the numbered shader
# source and the summary line.
gflops = test(localx=16, localy=16, membw=True)
# NOTE(review): presumably dumps the disassembly of the most recently compiled
# shader - confirm against the helper's definition.
showLastShaderDisassembly()

    1  
    2      #version 310 es
    3      precision highp float;
    4  
    5      layout(local_size_x = 16, local_size_y = 16) in;
    6      layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;
    7      layout(location = 2) uniform int iterations;
    8  
    9      void main() {
   10        vec2 p = (vec2(gl_GlobalInvocationID.xy) / vec2(256.,256.) - vec2(0.5)) * vec2(3.);
   11        int r = 0;
   12  
   13        for(int i = 0; i < iterations; i++) {
   14          if(dot(p, p) < 10.) {
   15            r++;
   16          }
   17          p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
   18        } 
   19        
   20        float n = float(r) / float(iterations) * 4.;
   21        
   22        imageStore(img_output, ivec2(gl_GlobalInvocationID.xy),
   23                vec4(0.5-cos(n*75.0)/2.0,0.5-cos(n* 120.0)/2.0,0.5-cos(n*165.0)/2.0,1.0));
   24      }
   25      
37.42 GFLOPS   43.92 MB/s  22.77 ms
FMAs: 57.69% (30 / 52)

clause_0:
ds(0) nbb r_uncond ncph 
{
    *NOP t0
    +U32_TO_F32 t1, r60
    *FMA.f32 r0:t0, t1, 0x3b800000 /* 0.003906 */, 0xbf000000 /* -0.500000 */
    +U32_TO_F32 t1, r61
    *FMA.f32 r1:t0, t1, 0x3b800000 /* 0.003906 */, 0xbf000000 /* -0.500000 */
    +NOP t1
    *FMA.f32 r0:t0, r0, 0x40400000 /* 3.000000 */, #0.neg
    +NOP t1
    *FMA.f32 r1:t0, r1, 0x40400000 /* 3.000000 */, #0.neg
    +MOV.i32 r2:t1, 0x00000000 /* 0.000000 */
    *NOP t0
    +MOV.i32 r3:t1, t1
}

clause_6:
ds(0) nbb r_uncond ncph 
{
    *NOP t0
    +ICMP.s32.m1.ge t1, r2, u0.w0
    *NOP t0
    +BRANCHZ.i16.eq t1, t1.h0, clause_11
}

clause_9:
ds(0) nbb 
{
    *NOP t0
    +JUMP t1, clause_21
}

clause_11:
ds(0) nbb ncph 
{
    *NOP t0
    +MOV.i32 t1, r1
    *FMA.f32 r4:t0, r1, t1, #0.neg
    +MOV.i32 t1, r0
    *FMA.f32 r5:t0, r0, t1, t0
    +NOP t1
}

clause_14:
ds(0) nbb r_uncond 
{
    *NOP t0
    +IADD.s32 t1, r3, 0x00000001 /* 0.000000 */
    *FCMP.f32.lt.m1 t0, r5, 0x41200000 /* 10.000000 */
    +MUX.i32 r3:t1, r3, t1, t
    *NOP t0
    +MOV.i32 t1, r0
    *FMA.f32 t0, r0, t1, 0x3f49db23 /* 0.788500 */
    +FADD.f32 r4:t1, t, r4.neg
    *MOV.i32 t0, r0
    +FADD.f32 t1, r0, t
    *FMA.f32 r1:t0, t1, r1, #0.neg
    +NOP t1
    *MOV.i32 t0, r2
    +IADD.s32 r2:t1, t, 0x00000001 /* 0.000000 */
    *MOV.i32 r0:t0, r4
    +JUMP t1, clause_6
}

clause_21:
ds(0) nbb ncph 
{
    *MOV.i32 t0, r3
    +S32_TO_F32 r0:t1, t
    *NOP t0
    +S32_TO_F32 t1, u0.w0
    *NOP t0
    +FRCP.f32 t1, t1
    *FMA.f32 r0:t0, r0, t1, #0.neg
    +NOP t1
}

clause_24:
ds(0) nbb ncph 
{
    *FMA.f32 r1:t0, 0x43960000 /* 300.000000 */, r0, #0.neg
    +NOP t1
    *FMA.f32 r2:t0, t0, 0x3f22f98c /* 0.636620 */, 0x49400000 /* 786432.000000 */
    +FADD.f32 t1, t, 0x49400000 /* 786432.000000 */.neg
    *FMA.f32 r1:t0, t1, 0xbfc90fd0 /* -1.570795 */, r1
    +FSIN_TABLE.u6 r3:t1, t0
    *FMA_RSCALE.f32 t0, t0, t0, #0.neg, 0xffffffff /* -nan */
    +FCOS_TABLE.u6 r2:t1, r2
    *FMA.f32 t0, t0, t1.neg, #0.neg
    +NOP t1
    *FMA.f32.clamp_m1_1 t0, r1, r3.neg, t0
    +NOP t1
    *NOP t0
    +FADD.f32 r1:t1, t0, r2
    *FMA.f32 r2:t0, 0x43f00000 /* 480.000000 */, r0, #0.neg
    +NOP t1
}

clause_31:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 r3:t0, r2, 0x3f22f98c /* 0.636620 */, 0x49400000 /* 786432.000000 */
    +FADD.f32 t1, t, 0x49400000 /* 786432.000000 */.neg
    *FMA.f32 r2:t0, t1, 0xbfc90fd0 /* -1.570795 */, r2
    +FSIN_TABLE.u6 r4:t1, t0
    *FMA_RSCALE.f32 t0, t0, t0, #0.neg, 0xffffffff /* -nan */
    +FCOS_TABLE.u6 r3:t1, r3
    *FMA.f32 t0, t0, t1.neg, #0.neg
    +NOP t1
    *FMA.f32.clamp_m1_1 t0, r2, r4.neg, t0
    +NOP t1
    *FMA.f32 r0:t0, 0x44250000 /* 660.000000 */, r0, #0.neg
    +FADD.f32 r2:t1, t0, r3
    *FMA.f32 r3:t0, t0, 0x3f22f98c /* 0.636620 */, 0x49400000 /* 786432.000000 */
    +NOP t1
    *NOP t0
    +FADD.f32 r4:t1, t0, 0x49400000 /* 786432.000000 */.neg
}

clause_38:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *FMA.f32 r0:t0, r4, 0xbfc90fd0 /* -1.570795 */, r0
    +FSIN_TABLE.u6 r4:t1, r3
    *FMA_RSCALE.f32 t0, t0, t0, #0.neg, 0xffffffff /* -nan */
    +FCOS_TABLE.u6 r3:t1, r3
    *FMA.f32 t0, t0, t1.neg, #0.neg
    +NOP t1
    *FMA.f32.clamp_m1_1 t0, r0, r4.neg, t0
    +NOP t1
    *NOP t0
    +FADD.f32 t1, t0, r3
    *FMA.f32 t0, t1, 0x3f000000 /* 0.500000 */, #0.neg
    +FADD.f32 t1, 0x3f000000 /* 0.500000 */, t.neg
    *MOV.i32 r3:t0, t1
    +MKVEC.v2i16 t1, r60, r61
    *DTSEL_IMM.attribute_1 t0, t1
    +LEA_ATTR_TEX.f32 t1, t, #0.x, #0.x, @r5
}

clause_45:
ds(0) eos store 
{
    *FMA.f32 t0, r1, 0x3f000000 /* 0.500000 */, #0.neg
    +FADD.f32 r1:t1, 0x3f000000 /* 0.500000 */, t.neg
    *FMA.f32 t0, r2, 0x3f000000 /* 0.500000 */, #0.neg
    +FADD.f32 r2:t1, 0x3f000000 /* 0.500000 */, t.neg
    *NOP t0
    +MOV.i32 r4:t1, 0x3f800000 /* 1.000000 */
    *NOP t0
    +ST_CVT.v4 t1, r5, r6, r7, @r1
}

shader11948 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills

# NOTE(review): presumably reads back (and displays) the 256x256 output texture
# left in the global `outtex` by the last test() call - confirm against the helper.
downloadTexture(outtex, 256, 256)

# Sweep work-group sizes 1..256 in both dimensions.
# NOTE(review): presumably grid_run calls test() for every localx/localy
# combination - confirm. test() returns NaN whenever localx * localy > 256.
r = grid_run(test, localx=[1,2,4,8,16,32,64,128,256], localy=[1,2,4,8,16,32,64,128,256])
# colorfun maps each result x to -log(x).
# NOTE(review): presumably used only to colour the grid cells - confirm.
grid_show(r, colorfun=lambda x: -np.log(x))

command stream to file pandecode.dump.0208
pandecode: dump command stream to file pandecode.dump.0209
pandecode: dump command stream to file pandecode.dump.0210
pandecode: dump command stream to file pandecode.dump.0211
pandecode: dump command stream to file pandecode.dump.0212
pandecode: dump command stream to file pandecode.dump.0213
pandecode: dump command stream to file pandecode.dump.0214
pandecode: dump command stream to file pandecode.dump.0215
pandecode: dump command stream to file pandecode.dump.0216
pandecode: dump command stream to file pandecode.dump.0217
pandecode: dump command stream to file pandecode.dump.0218
pandecode: dump command stream to file pandecode.dump.0219
pandecode: dump command stream to file pandecode.dump.0220
pandecode: dump command stream to file pandecode.dump.0221
pandecode: dump command stream to file pandecode.dump.0222
pandecode: dump command stream to file pandecode.dump.0223
pandecode: dump command stream to file pandecode.dump.0224
pandecode: dump command stream to file pandecode.dump.0225
pandecode: dump command stream to file pandecode.dump.0226
pandecode: dump command stream to file pandecode.dump.0227
pandecode: dump command stream to file pandecode.dump.0228
pandecode: dump command stream to file pandecode.dump.0229
pandecode: dump command stream to file pandecode.dump.0230
pandecode: dump command stream to file pandecode.dump.0231
pandecode: dump command stream to file pandecode.dump.0232
pandecode: dump command stream to file pandecode.dump.0233
pandecode: dump command stream to file pandecode.dump.0234
pandecode: dump command stream to file pandecode.dump.0235
pandecode: dump command stream to file pandecode.dump.0236
pandecode: dump command stream to file pandecode.dump.0237
pandecode: dump command stream to file pandecode.dump.0238
pandecode: dump command stream to file pandecode.dump.0239
pandecode: dump command stream to file pandecode.dump.0240
pandecode: dump command stream to file pandecode.dump.0241
pandecode: dump command stream to file pandecode.dump.0242
pandecode: dump command stream to file pandecode.dump.0243
pandecode: dump command stream to file pandecode.dump.0244
pandecode: dump command stream to file pandecode.dump.0245
pandecode: dump command stream to file pandecode.dump.0246
pandecode: dump command stream to file pandecode.dump.0247
pandecode: dump command stream to file pandecode.dump.0248
pandecode: dump command stream to file pandecode.dump.0249
pandecode: dump command stream to file pandecode.dump.0250
pandecode: dump command stream to file pandecode.dump.0251
pandecode: dump command stream to file pandecode.dump.0252
pandecode: dump command stream to file pandecode.dump.0253
pandecode: dump command stream to file pandecode.dump.0254
pandecode: dump command stream to file pandecode.dump.0255
pandecode: dump command stream to file pandecode.dump.0256
pandecode: dump command stream to file pandecode.dump.0257
pandecode: dump command stream to file pandecode.dump.0258
pandecode: dump command stream to file pandecode.dump.0259
pandecode: dump command stream to file pandecode.dump.0260
pandecode: dump command stream to file pandecode.dump.0261
pandecode: dump command stream to file pandecode.dump.0262
pandecode: dump command stream to file pandecode.dump.0263
pandecode: dump command stream to file pandecode.dump.0264
pandecode: dump command stream to file pandecode.dump.0265
pandecode: dump command stream to file pandecode.dump.0266
pandecode: dump command stream to file pandecode.dump.0267
pandecode: dump command stream to file pandecode.dump.0268
pandecode: dump command stream to file pandecode.dump.0269
pandecode: dump command stream to file pandecode.dump.0270
pandecode: dump command stream to file pandecode.dump.0271
pandecode: dump command stream to file pandecode.dump.0272
pandecode: dump command stream to file pandecode.dump.0273
pandecode: dump command stream to file pandecode.dump.0274
pandecode: dump command stream to file pandecode.dump.0275
pandecode: dump command stream to file pandecode.dump.0276
pandecode: dump command stream to file pa

Test 7: Julia fractal (4-way unroll)

wh = 256
def test(wh=wh, localx=1, localy=1, membw=False):
    global intex, outtex, source
    w,h = wh,wh
    if localx * localy > 256: return float('nan')
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)
    source = f"""
    #version 310 es
    precision highp float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;
    layout(location = 2) uniform int iterations;

    void main() {{
      vec2 p = (vec2(gl_GlobalInvocationID.xy) / vec2({w}.,{h}.) - vec2(0.5)) * vec2(3.);
      int r = 0;

      for(int i = 0; i < iterations / 4; i++) {{
        if(dot(p, p) < 10.) {{
          r++;
        }}
        p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
        if(dot(p, p) < 10.) {{
          r++;
        }}
        p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
        if(dot(p, p) < 10.) {{
          r++;
        }}
        p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
        if(dot(p, p) < 10.) {{
          r++;
        }}
        p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
      }} 
      
      float n = float(r) / float(iterations) * 4.;
      
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy),
              vec4(0.5-cos(n*75.0)/2.0,0.5-cos(n* 120.0)/2.0,0.5-cos(n*165.0)/2.0,1.0));
    }}
    """
    if membw:
        print("\n".join([f"{n+1: 5d}  {line}" for n, line in enumerate(source.split('\n'))]))
    computeShader(source)
    ITERS = 1000
    gl.glUniform1i(2, ITERS)
    start = time.perf_counter()
    for i in range(50):
        gl.glDispatchCompute(w//localx, h//localy, 1)
        # make sure writing to image has finished before read
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    gl.glFinish()
    elapsed = (time.perf_counter() - start)/50
    output = downloadTextureFloat(outtex, w, h)
    MACs = 32*ITERS/4*w*h

    if membw:
        print(f"{MACs / elapsed / 1e9:.2f} GFLOPS   {w*h*4*4 / elapsed / 1024 / 1024:.2f} MB/s  {elapsed * 1e3:.2f} ms")
    
    return MACs / elapsed / 1e9

# First call uses the defaults (1x1 work-groups) and discards the result.
# NOTE(review): presumably a warm-up run - confirm.
test()
# Measure with 16x16 work-groups; membw=True also prints the numbered shader
# source and the summary line.
gflops = test(localx=16, localy=16, membw=True)
# NOTE(review): presumably dumps the disassembly of the most recently compiled
# shader - confirm against the helper's definition.
showLastShaderDisassembly()

    1  
    2      #version 310 es
    3      precision highp float;
    4  
    5      layout(local_size_x = 16, local_size_y = 16) in;
    6      layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;
    7      layout(location = 2) uniform int iterations;
    8  
    9      void main() {
   10        vec2 p = (vec2(gl_GlobalInvocationID.xy) / vec2(256.,256.) - vec2(0.5)) * vec2(3.);
   11        int r = 0;
   12  
   13        for(int i = 0; i < iterations / 4; i++) {
   14          if(dot(p, p) < 10.) {
   15            r++;
   16          }
   17          p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
   18          if(dot(p, p) < 10.) {
   19            r++;
   20          }
   21          p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
   22          if(dot(p, p) < 10.) {
   23            r++;
   24          }
   25          p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
   26          if(dot(p, p) < 10.) {
   27            r++;
   28          }
   29          p = vec2(p.x*p.x - p.y*p.y + 0.7885, 2.*p.x*p.y);
   30        } 
   31        
   32        float n = float(r) / float(iterations) * 4.;
   33        
   34        imageStore(img_output, ivec2(gl_GlobalInvocationID.xy),
   35                vec4(0.5-cos(n*75.0)/2.0,0.5-cos(n* 120.0)/2.0,0.5-cos(n*165.0)/2.0,1.0));
   36      }
   37      
36.76 GFLOPS   70.12 MB/s  14.26 ms
FMAs: 59.15% (42 / 71)

clause_0:
ds(0) nbb r_uncond ncph 
{
    *NOP t0
    +U32_TO_F32 t1, r60
    *FMA.f32 r0:t0, t1, 0x3b800000 /* 0.003906 */, 0xbf000000 /* -0.500000 */
    +U32_TO_F32 t1, r61
    *FMA.f32 r1:t0, t1, 0x3b800000 /* 0.003906 */, 0xbf000000 /* -0.500000 */
    +NOP t1
    *FMA.f32 r0:t0, r0, 0x40400000 /* 3.000000 */, #0.neg
    +NOP t1
    *FMA.f32 r1:t0, r1, 0x40400000 /* 3.000000 */, #0.neg
    +MOV.i32 r2:t1, 0x00000000 /* 0.000000 */
    *NOP t0
    +MOV.i32 r3:t1, t1
}

clause_6:
ds(0) nbb r_uncond ncph 
{
    *NOP t0
    +MOV.i32 t1, 0xffffffff /* -nan */
    *CSEL.s32.gt t0, u0.w0, t1, u0.w0, t1
    +NOP t1
    *CSEL.s32.gt r4:t0, 0x00000001 /* 0.000000 */, t0, t0, 0x00000001 /* 0.000000 */
    +NOP t1
    *NOP t0
    +IABS.s32 t1, u0.w0
    *RSHIFT_OR.i32 t0, t1, #0, 0x00000002 /* 0.000000 */
    +NOP t1
    *IMUL.i32 t0, r4, t0
    +ICMP.s32.m1.ge t1, r2, t
    *NOP t0
    +BRANCHZ.i16.eq t1, t1.h0, clause_14
}

clause_12:
ds(0) nbb 
{
    *NOP t0
    +JUMP t1, clause_36
}

clause_14:
ds(0) nbb ncph 
{
    *NOP t0
    +MOV.i32 r4:t1, r1
}

clause_15:
ds(0) nbb ncph 
{
    *FMA.f32 r4:t0, r1, r4, #0.neg
    +MOV.i32 t1, r0
    *FMA.f32 r5:t0, r0, t1, t0
    +NOP t1
    *NOP t0
    +IADD.s32 t1, r3, 0x00000001 /* 0.000000 */
    *FCMP.f32.lt.m1 t0, r5, 0x41200000 /* 10.000000 */
    +MUX.i32 r3:t1, r3, t1, t
    *NOP t0
    +MOV.i32 t1, r0
    *FMA.f32 t0, r0, t1, 0x3f49db23 /* 0.788500 */
    +FADD.f32 r4:t1, t, r4.neg
    *MOV.i32 t0, r0
    +FADD.f32 t1, r0, t
    *FMA.f32 r0:t0, t1, r1, #0.neg
    +NOP t1
}

clause_22:
ds(0) nbb ncph 
{
    *FMA.f32 r1:t0, r0, r0, #0.neg
    +NOP t1
    *FMA.f32 t0, r4, r4, t0
    +IADD.s32 t1, r3, 0x00000001 /* 0.000000 */
    *FCMP.f32.lt.m1 t0, t0, 0x41200000 /* 10.000000 */
    +MUX.i32 r3:t1, r3, t1, t
    *FMA.f32 r5:t0, r4, r4, 0x3f49db23 /* 0.788500 */
    +FADD.f32 t1, r4, r4
    *FMA.f32 r0:t0, t1, r0, #0.neg
    +NOP t1
    *FMA.f32 r4:t0, t0, t0, #0.neg
    +FADD.f32 r1:t1, r5, r1.neg
    *FMA.f32 t0, t1, t1, t0
    +IADD.s32 t1, r3, 0x00000001 /* 0.000000 */
    *FCMP.f32.lt.m1 t0, t0, 0x41200000 /* 10.000000 */
    +MUX.i32 r3:t1, r3, t1, t
}

clause_29:
ds(0) nbb r_uncond 
{
    *FMA.f32 r5:t0, r1, r1, 0x3f49db23 /* 0.788500 */
    +FADD.f32 t1, r1, r1
    *FMA.f32 r1:t0, t1, r0, #0.neg
    +NOP t1
    *FMA.f32 r0:t0, t0, t0, #0.neg
    +FADD.f32 r4:t1, r5, r4.neg
    *FMA.f32 t0, t1, t1, t0
    +IADD.s32 t1, r3, 0x00000001 /* 0.000000 */
    *FCMP.f32.lt.m1 t0, t0, 0x41200000 /* 10.000000 */
    +MUX.i32 r3:t1, r3, t1, t
    *FMA.f32 t0, r4, r4, 0x3f49db23 /* 0.788500 */
    +FADD.f32 r0:t1, t, r0.neg
    *FADD.f32 t0, r4, r4
    +IADD.s32 r2:t1, r2, 0x00000001 /* 0.000000 */
    *FMA.f32 r1:t0, t0, r1, #0.neg
    +JUMP t1, clause_6
}

clause_36:
ds(0) nbb ncph 
{
    *MOV.i32 t0, r3
    +S32_TO_F32 r0:t1, t
    *NOP t0
    +S32_TO_F32 t1, u0.w0
    *NOP t0
    +FRCP.f32 t1, t1
    *FMA.f32 r0:t0, r0, t1, #0.neg
    +NOP t1
}

clause_39:
ds(0) nbb ncph 
{
    *FMA.f32 r1:t0, 0x43960000 /* 300.000000 */, r0, #0.neg
    +NOP t1
    *FMA.f32 r2:t0, t0, 0x3f22f98c /* 0.636620 */, 0x49400000 /* 786432.000000 */
    +FADD.f32 t1, t, 0x49400000 /* 786432.000000 */.neg
    *FMA.f32 r1:t0, t1, 0xbfc90fd0 /* -1.570795 */, r1
    +FSIN_TABLE.u6 r3:t1, t0
    *FMA_RSCALE.f32 t0, t0, t0, #0.neg, 0xffffffff /* -nan */
    +FCOS_TABLE.u6 r2:t1, r2
    *FMA.f32 t0, t0, t1.neg, #0.neg
    +NOP t1
    *FMA.f32.clamp_m1_1 t0, r1, r3.neg, t0
    +NOP t1
    *NOP t0
    +FADD.f32 r1:t1, t0, r2
    *FMA.f32 r2:t0, 0x43f00000 /* 480.000000 */, r0, #0.neg
    +NOP t1
}

clause_46:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 r3:t0, r2, 0x3f22f98c /* 0.636620 */, 0x49400000 /* 786432.000000 */
    +FADD.f32 t1, t, 0x49400000 /* 786432.000000 */.neg
    *FMA.f32 r2:t0, t1, 0xbfc90fd0 /* -1.570795 */, r2
    +FSIN_TABLE.u6 r4:t1, t0
    *FMA_RSCALE.f32 t0, t0, t0, #0.neg, 0xffffffff /* -nan */
    +FCOS_TABLE.u6 r3:t1, r3
    *FMA.f32 t0, t0, t1.neg, #0.neg
    +NOP t1
    *FMA.f32.clamp_m1_1 t0, r2, r4.neg, t0
    +NOP t1
    *FMA.f32 r0:t0, 0x44250000 /* 660.000000 */, r0, #0.neg
    +FADD.f32 r2:t1, t0, r3
    *FMA.f32 r3:t0, t0, 0x3f22f98c /* 0.636620 */, 0x49400000 /* 786432.000000 */
    +NOP t1
    *NOP t0
    +FADD.f32 r4:t1, t0, 0x49400000 /* 786432.000000 */.neg
}

clause_53:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *FMA.f32 r0:t0, r4, 0xbfc90fd0 /* -1.570795 */, r0
    +FSIN_TABLE.u6 r4:t1, r3
    *FMA_RSCALE.f32 t0, t0, t0, #0.neg, 0xffffffff /* -nan */
    +FCOS_TABLE.u6 r3:t1, r3
    *FMA.f32 t0, t0, t1.neg, #0.neg
    +NOP t1
    *FMA.f32.clamp_m1_1 t0, r0, r4.neg, t0
    +NOP t1
    *NOP t0
    +FADD.f32 t1, t0, r3
    *FMA.f32 t0, t1, 0x3f000000 /* 0.500000 */, #0.neg
    +FADD.f32 t1, 0x3f000000 /* 0.500000 */, t.neg
    *MOV.i32 r3:t0, t1
    +MKVEC.v2i16 t1, r60, r61
    *DTSEL_IMM.attribute_1 t0, t1
    +LEA_ATTR_TEX.f32 t1, t, #0.x, #0.x, @r5
}

clause_60:
ds(0) eos store 
{
    *FMA.f32 t0, r1, 0x3f000000 /* 0.500000 */, #0.neg
    +FADD.f32 r1:t1, 0x3f000000 /* 0.500000 */, t.neg
    *FMA.f32 t0, r2, 0x3f000000 /* 0.500000 */, #0.neg
    +FADD.f32 r2:t1, 0x3f000000 /* 0.500000 */, t.neg
    *NOP t0
    +MOV.i32 r4:t1, 0x3f800000 /* 1.000000 */
    *NOP t0
    +ST_CVT.v4 t1, r5, r6, r7, @r1
}

shader14548 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills

# Sweep work-group sizes 1..256 in both dimensions.
# NOTE(review): presumably grid_run calls test() for every localx/localy
# combination - confirm. test() returns NaN whenever localx * localy > 256.
r = grid_run(test, localx=[1,2,4,8,16,32,64,128,256], localy=[1,2,4,8,16,32,64,128,256])
# colorfun maps each result x to -log(x).
# NOTE(review): presumably used only to colour the grid cells - confirm.
grid_show(r, colorfun=lambda x: -np.log(x))